#define __CAT__(x, y) x ## _ ## y
#define CAT(x, y) __CAT__(x, y)
INLINE vreal CAT(msm_gamma, ORDER)(vreal rho, vreal *gamma) {
  vreal rho2 = rho * rho;
  const real *gcons = GCONS[ORDER >> 1];
  vreal g = gcons[ORDER >> 1];
  for (int i = (ORDER >> 1) - 1; i >= 0; i --) {
    g = g * rho2 + gcons[i];
  }
}

#define GAMMA_EVAL0(gamma, rho2) ((gamma) * rho2 + gcons0v4)
#define GAMMA_EVAL2(gamma, rho2) GAMMA_EVAL0((gamma) * rho2 + gcons1v4, rho2)
#define GAMMA_EVAL4(gamma, rho2) GAMMA_EVAL2((gamma) * rho2 + gcons2v4, rho2)
#define GAMMA_EVAL6(gamma, rho2) GAMMA_EVAL4((gamma) * rho2 + gcons3v4, rho2)
#define GAMMA_EVAL8(gamma, rho2) GAMMA_EVAL6((gamma) * rho2 + gcons4v4, rho2)
#define GAMMA_EVAL10(gamma, rho2) GAMMA_EVAL8((gamma) * rho2 + gcons5v4, rho2)

#define GAMMA_4(rho2) GAMMA_EVAL2(gcons2v4, rho2)
#define GAMMA_6(rho2) GAMMA_EVAL4(gcons3v4, rho2)
#define GAMMA_8(rho2) GAMMA_EVAL6(gcons4v4, rho2)
#define GAMMA_10(rho2) GAMMA_EVAL8(gcons5v4, rho2)
#define GAMMA_12(rho2) GAMMA_EVAL10(gcons6v4, rho2)

#define DGAMMA_EVAL2(dgamma, rho2) ((dgamma) * rho2 + dgcons0v4)
#define DGAMMA_EVAL4(dgamma, rho2) DGAMMA_EVAL2((dgamma) * rho2 + dgcons1v4, rho2)
#define DGAMMA_EVAL6(dgamma, rho2) DGAMMA_EVAL4((dgamma) * rho2 + dgcons2v4, rho2)
#define DGAMMA_EVAL8(dgamma, rho2) DGAMMA_EVAL6((dgamma) * rho2 + dgcons3v4, rho2)
#define DGAMMA_EVAL10(dgamma, rho2) DGAMMA_EVAL8((dgamma) * rho2 + dgcons4v4, rho2)

#define DGAMMA_4(rho, rho2) (DGAMMA_EVAL2(dgcons1v4, rho2) * rho)
#define DGAMMA_6(rho, rho2) (DGAMMA_EVAL4(dgcons2v4, rho2) * rho)
#define DGAMMA_8(rho, rho2) (DGAMMA_EVAL6(dgcons3v4, rho2) * rho)
#define DGAMMA_10(rho, rho2) (DGAMMA_EVAL8(dgcons4v4, rho2) * rho)
#define DGAMMA_12(rho, rho2) (DGAMMA_EVAL10(dgcons5v4, rho2) * rho)

#if ORDER == 4
#define GCONS0 15.0 / 8.
#define GCONS1 -5.0 / 4.
#define GCONS2 3.0 / 8.
#elif ORDER == 6
#define GCONS0 35.0 / 16.
#define GCONS1 -35.0 / 16.
#define GCONS2 21.0 / 16.
#define GCONS3 -5.0 / 16.
#elif ORDER == 8
#define GCONS0 315.0 / 128.
#define GCONS1 -105.0 / 32.
#define GCONS2 189.0 / 64.
#define GCONS3 -45.0 / 32.
#define GCONS4 35.0 / 128.
#elif ORDER == 10
#define GCONS0 693.0 / 256.
#define GCONS1 -1155.0 / 256.
#define GCONS2 693.0 / 128.
#define GCONS3 -495.0 / 128.
#define GCONS4 385.0 / 256.
#define GCONS5 -63.0 / 256.
#elif ORDER == 12
#define GCONS0 3003.0 / 1024.
#define GCONS1 -3003.0 / 512.
#define GCONS2 9009.0 / 1024.
#define GCONS3 -2145.0 / 256.
#define GCONS4 5005.0 / 1024.
#define GCONS5 -819.0 / 512.
#define GCONS6 231.0 / 1024.
#endif

#if ORDER == 4
#define DGCONS0 -5.0 / 2.
#define DGCONS1 3.0 / 2.
#elif ORDER == 6
#define DGCONS0 -35.0 / 8.
#define DGCONS1 21.0 / 4.
#define DGCONS2 -15.0 / 8.
#elif ORDER == 8
#define DGCONS0 -105.0 / 16.
#define DGCONS1 189.0 / 16.
#define DGCONS2 -135.0 / 16.
#define DGCONS3 35.0 / 16.
#elif ORDER == 10
#define DGCONS0 -1155.0 / 128.
#define DGCONS1 693.0 / 32.
#define DGCONS2 -1485.0 / 64.
#define DGCONS3 385.0 / 32.
#define DGCONS4 -315.0 / 128.
#elif ORDER == 12
#define DGCONS0 -3003.0 / 256.
#define DGCONS1 9009.0 / 256.
#define DGCONS2 -6435.0 / 128.
#define DGCONS3 5005.0 / 128.
#define DGCONS4 -4095.0 / 256.
#define DGCONS5 693.0 / 256.
#endif

void CAT(nonbonded_force_vec_lj_msm_cpe, ORDER)(cpe_nonbonded_param_t *pm) {
  lwpf_enter(NONB);
  lwpf_start(INIT);
  dma_init();
  cpe_nonbonded_param_t lpm;
  pe_get(pm, &lpm, sizeof(cpe_nonbonded_param_t));
  dma_syn();
  doublev4 ncut = 0;
  int ncompute = 0;
  nonbonded_param_t lp;
  cellgrid_t lgrid;
  mdstat_t lstat;
  pe_get(lpm.p, &lp, sizeof(nonbonded_param_t));
  pe_get(lpm.grid, &lgrid, sizeof(cellgrid_t));
  if (_MYID == 0)
    pe_get(lpm.stat, &lstat, sizeof(mdstat_t));
  dma_syn();

  sw_archdata_t archdata;
  pe_get(lgrid.arch_data, &archdata, sizeof(sw_archdata_t));
  dma_syn();
  vec<real> fi[CELL_CAP];
  for (int i = 0; i < CELL_CAP; i++) {
    vecset1(fi[i], 0);
  }

  // for (int i = _MYID; i < archdata.nreps; i += 64) {
  //   pe_put(archdata.freps[i].f, fi, sizeof(vec<real>) * CELL_CAP);
  // }
  // dma_syn();
  lwpf_stop(INIT);
  qthread_syn();
  lwpf_start(COMPUTE);
  // if (_MYID > 0) return;
  int nncell = hdcell(lgrid.nn, 0, 0, 0);
  // memset(&lstat, 0, sizeof(lstat));

  vreal f0v4 /* = 0.0*/;
  vreal f1v4 /* = 1.0*/;
  vreal f2v4 /* = 2.0*/;
  vreal f3v4 /* = 3.0*/;
  vreal f4v4;
  vreal f6v4 /* = 6.0*/;
  vreal f12v4;
  SET_VCONSTF(f0v4, 0.0);
  SET_VCONSTF(f1v4, 1.0);
  SET_VCONSTF(f2v4, 2.0);
  SET_VCONSTF(f3v4, 3.0);
  SET_VCONSTF(f4v4, 4.0);
  SET_VCONSTF(f6v4, 6.0);
  SET_VCONSTF(f12v4, 12.0);
  vreal gcons0v4;
  vreal gcons1v4;
  vreal gcons2v4;
  vreal gcons3v4;
  vreal gcons4v4;
  vreal gcons5v4;
  vreal gcons6v4;
  vreal dgcons0v4;
  vreal dgcons1v4;
  vreal dgcons2v4;
  vreal dgcons3v4;
  vreal dgcons4v4;
  vreal dgcons5v4;

  SET_VCONSTF(gcons0v4, GCONS0);
  SET_VCONSTF(gcons1v4, GCONS1);
  SET_VCONSTF(gcons2v4, GCONS2);
  SET_VCONSTF(dgcons0v4, DGCONS0);
  SET_VCONSTF(dgcons1v4, DGCONS1);
  #if ORDER > 4
  SET_VCONSTF(gcons3v4, GCONS3);
  SET_VCONSTF(dgcons2v4, DGCONS2);
  #endif
  #if ORDER > 6
  SET_VCONSTF(gcons4v4, GCONS4);
  SET_VCONSTF(dgcons3v4, DGCONS3);
  #endif
  #if ORDER > 8
  SET_VCONSTF(gcons5v4, GCONS5);
  SET_VCONSTF(dgcons4v4, DGCONS4);
  #endif
  #if ORDER > 10
  SET_VCONSTF(gcons6v4, GCONS6);
  SET_VCONSTF(dgcons5v4, DGCONS5);
  #endif
  // vreal masks[4];
  // masks[0] = simd_set_uint256(0.0, 0.0, 0.0, 0.0);
  // masks[1] = simd_set_uint256(0.0, 2.0, 2.0, 2.0);
  // masks[2] = simd_set_uint256(0.0, 0.0, 2.0, 2.0);
  // masks[3] = simd_set_uint256(0.0, 0.0, 0.0, 2.0);
  real eps_rf = 78.5;
  real eps_r = 1;
  real k_rf = (eps_rf - eps_r) / ((2 * eps_rf + eps_r) * lp.rcut * lp.rcut * lp.rcut);
  real c_rf = (3 * eps_rf) / ((2 * eps_rf + eps_r) * lp.rcut);

  /* this header file yields r2, scaled and excluded for computing nonbonded interactions with various methods*/
  /* if excluded particles also needs processing (e.g. subtracting long ranged coulomb), define NO_SKIP_EXCL before include*/
  vdw_param_t vdw_param[64];
  pe_get(lp.vdw_param, vdw_param, sizeof(vdw_param_t) * lp.ntypes);

  real rcut = lp.rcut;
  real inv_rcut = 1. / rcut;
  real rsw = lp.rsw;
  real rsw2 = rsw * rsw;
  real scale14 = lp.scale14;
  real coul_const = lp.coul_const;
  vreal scale14cv4;
  SET_VCONSTF(scale14cv4, 1.0 - scale14);
  //  = lp.vdw_param;

  real rcut2 = rcut * rcut;
  vreal rcut2v4 = rcut2;
  vreal rcutv4 = rcut;
  vreal rsw2v4 = rsw2;
  vreal inv_rcutv4 = f1v4 / rcutv4;
  vreal inv_rcut2v4 = f1v4 / rcut2v4;
  vreal denom_switchv4 = rcut2v4 - rsw2v4;
  vreal inv_denom_switchv4 = f1v4 / (denom_switchv4 * denom_switchv4 * denom_switchv4);
  int256 l1v4 = simd_set_int256(1, 1, 1, 1);

  int order = lp.order;
  vreal evdwlv4 = f0v4, ecoulv4 = f0v4;
  dma_syn();
  FOREACH_LOCAL_CELL(&lgrid, ii, jj, kk, icell) {
    int cellid = getcid(&lgrid, ii, jj, kk);
    if (!(archdata.pe_range[_MYID].st <= cellid && archdata.pe_range[_MYID].ed > cellid))
      continue;

    long cpe_mask = (1L << _MYID) - 1;
    cellmeta_t imeta;
    lwpf_start(DMA_RI);
    pe_get(&(icell->basis), &imeta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> xi[CELL_CAP];
    // vec<real> fi[CELL_CAP];
    real qi[CELL_CAP];
    int ti[CELL_CAP];
    int first_scal_cell[MAX_NEIGH];
    int first_excl_cell[MAX_NEIGH];
    int excl_id[MAX_SCAL_CELL][2];
    int scal_id[MAX_EXCL_CELL][2];
    pe_get(icell->x, xi, sizeof(vec<real>) * imeta.natom);
    // pe_get(icell->f, fi, sizeof(vec<real>) * imeta.natom);
    pe_get(icell->t, ti, sizeof(int) * imeta.natom);
    pe_get(icell->q, qi, sizeof(real) * imeta.natom);
    pe_get(icell->first_excl_cell, first_excl_cell, sizeof(int) * MAX_NEIGH);
    pe_get(icell->first_scal_cell, first_scal_cell, sizeof(int) * MAX_NEIGH);

    dma_syn();
    int nexcl = first_excl_cell[nncell + 1];
    int nscal = first_scal_cell[nncell + 1];
    if (nexcl)
      pe_get(icell->excl_id, excl_id, sizeof(int) * 2 * nexcl);
    if (nscal)
      pe_get(icell->scal_id, scal_id, sizeof(int) * 2 * nscal);
    dma_syn();
    lwpf_stop(DMA_RI);
    vec<real> oi, hi;
    cell2box(&oi, &hi, xi, imeta.natom);
    real xit[CELL_CAP], yit[CELL_CAP], zit[CELL_CAP];
    real fxit[CELL_CAP], fyit[CELL_CAP], fzit[CELL_CAP];
    real epsi[CELL_CAP], sigi[CELL_CAP], epsi14[CELL_CAP], sigi14[CELL_CAP];
    for (int i = 0; i < imeta.natom; i++) {
      epsi[i] = vdw_param[ti[i]].eps;
      sigi[i] = vdw_param[ti[i]].sig;
      epsi14[i] = vdw_param[ti[i]].eps14;
      sigi14[i] = vdw_param[ti[i]].sig14;
      xit[i] = xi[i].x;
      yit[i] = xi[i].y;
      zit[i] = xi[i].z;
      fxit[i] = 0;
      fyit[i] = 0;
      fzit[i] = 0;
    }

    FOREACH_NEIGHBOR_HALF_SHELL(&lgrid, ii, jj, kk, dx, dy, dz, jcell) {
      cellmeta_t jmeta;
      /*this is a marker that indicates the most recent i atom has excl/scal with index j*/
      long is_excl[CELL_CAP];
      long is_scal[CELL_CAP];
      real is_masked[CELL_CAP];
      lwpf_start(DMA_RJ);

      pe_get(&(jcell->basis), &jmeta, sizeof(cellmeta_t));
      dma_syn();
      int far = abs(dx) + abs(dy) + abs(dz) > 1;
      int jcell_id = jcell - lgrid.cells;
      int did = hdcell(lgrid.nn, dx, dy, dz);
      vec<real> dcell;
      vecsubv(dcell, jmeta.basis, imeta.basis);
      for (int i = 0; i < imeta.natom; i++) {
        is_excl[i] = 0x3fffffff;
        is_scal[i] = 0x3fffffff;
        is_masked[i] = 0;
      }
      int ie = first_excl_cell[did];
      int is = first_scal_cell[did];

      vec<real> xj[CELL_CAP];
      vec<real> fj[CELL_CAP];
      int tj[CELL_CAP];
      real qj[CELL_CAP];
      pe_get(jcell->x, xj, sizeof(vec<real>) * jmeta.natom);
      dma_syn();
      pe_get(jcell->t, tj, sizeof(int) * jmeta.natom);
      pe_get(jcell->q, qj, sizeof(real) * jmeta.natom);
      int irep = __builtin_popcountl(jmeta.pe_mask & cpe_mask);
      if (irep == 0) {
        pe_get(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        if (imeta.rep_init_mask & (1L << did)) {
          for (int j = 0; j < jmeta.natom; j++) {
            vecset1(fj[j], 0);
          }
        } else {
          pe_get(archdata.freps[jmeta.first_frep + irep - 1].f, fj, sizeof(vec<real>) * jmeta.natom);
        }
      }
      vreal dcellxv4 = dcell.x, dcellyv4 = dcell.y, dcellzv4 = dcell.z;
      long jskip[CELL_CAP], nj = 0;
      if (far) {
        vreal oxv4 = oi.x - dcell.x;
        vreal oyv4 = oi.y - dcell.y;
        vreal ozv4 = oi.z - dcell.z;
        vreal hxv4 = hi.x;
        vreal hyv4 = hi.y;
        vreal hzv4 = hi.z;
        for (int j = 0; j < jmeta.natom; j += 4) {
          vreal xv4, yv4, zv4;
          simd_load_xyzvec(xv4, yv4, zv4, (real *)(xj + j));
          vreal dist2v4 = vvdist2_atom_box(xv4, yv4, zv4, oxv4, oyv4, ozv4, hxv4, hyv4, hzv4);
          vreal skipv4 = simd_vfcmplt(rcut2v4, dist2v4);
          simd_store(skipv4, (real*)jskip + j);
        }
      } else {
        for (int j = 0; j < jmeta.natom; j++) {
          jskip[j] = 0;
        }
      }
      
      dma_syn();

      lwpf_stop(DMA_RJ);
      lwpf_start(JLOOP);
      for (int j = 0; j < jmeta.natom; j++) {
        // int j = jskip[jx];
        if (jskip[j])
          continue;
        vec<real> xneigh;
        vecaddv(xneigh, dcell, xj[j]);

        
        // lwpf_start(SECT_GLD);
        while (ie < first_excl_cell[did + 1] && excl_id[ie][1] <= j) {
          is_excl[excl_id[ie][0]] = excl_id[ie][1];
          ie++;
        }
        while (is < first_scal_cell[did + 1] && scal_id[is][1] <= j) {
          is_scal[scal_id[is][0]] = scal_id[is][1];
          is++;
        }
        // lwpf_stop(SECT_GLD);
        int ni = imeta.natom;
        if (icell == jcell) {
          ni = j;
        }
        for (int i = ni; i < ni + VEC_MASK; i++) {
          is_masked[i] = 1.0;
        }
        int256 jv4 = simd_set_int256(j, j, j, j);
        vreal xjv4 = xneigh.x;
        vreal yjv4 = xneigh.y;
        vreal zjv4 = xneigh.z;
        vreal qjv4 = qj[j];
        vreal sigjv4 = vdw_param[tj[j]].sig;
        vreal epsjv4 = vdw_param[tj[j]].eps;
        vreal sigj14v4 = vdw_param[tj[j]].sig14;
        vreal epsj14v4 = vdw_param[tj[j]].eps14;
        vreal fxjv4 = f0v4, fyjv4 = f0v4, fzjv4 = f0v4;
        lwpf_start(ILOOP);
        for (int i = 0; i < ni; i += VEC_WIDTH) {
          vreal exclv4, scalv4;
          vreal iexclv4, iscalv4;
          asm volatile(
              "vldd %0,%2\n\t"
              "vldd %1,%3\n\t"
              "vsubl %0,%4,%0\n\t"
              "vsubl %1,%4,%1\n\t"
              : "=r"(iexclv4), "=r"(iscalv4)
              : "m"(is_excl[i]), "m"(is_scal[i]), "r"(jv4));
          exclv4 = simd_vseleq(iexclv4, f0v4, f1v4);
          scalv4 = simd_vseleq(iscalv4, f0v4, f1v4);
          // iscalv4 = (int256)scalv4;
          vreal xiv4, yiv4, ziv4;
          simd_load(xiv4, xit + i);
          simd_load(yiv4, yit + i);
          simd_load(ziv4, zit + i);
          vreal dxv4 = xiv4 - xjv4;
          vreal dyv4 = yiv4 - yjv4;
          vreal dzv4 = ziv4 - zjv4;
          vreal r2v4 = dxv4 * dxv4 + dyv4 * dyv4 + dzv4 * dzv4;
          vreal ltcutv4 = simd_vfcmplt(r2v4, rcut2v4);
          vreal maskv4;
          simd_load(maskv4, is_masked + i);
          ltcutv4 = simd_vseleq(maskv4, ltcutv4, f0v4);
          // ltcutv4 = simd_vseleq(exclv4, f0v4, ltcutv4);
          int has_ltcut;
          has_ftrue(has_ltcut, ltcutv4);


          if (has_ltcut) {

            vreal inv_r2v4 = f1v4 / r2v4;
            vreal inv_rv4 = simd_vsqrt(inv_r2v4);

            vreal rv4 = r2v4 * inv_rv4;

            vreal qiv4;
            simd_load(qiv4, qi + i);
            vreal kqqv4 = coul_const * qiv4 * qjv4;
            vreal dwdrv4, dudrv4, wv4, uv4;

            vreal sigi14v4, epsi14v4, sigiv4, epsiv4;
            simd_load(sigi14v4, sigi14 + i);
            simd_load(sigiv4, sigi + i);
            simd_load(epsi14v4, epsi14 + i);
            simd_load(epsiv4, epsi + i);

            vreal epsi_usev4 = simd_vseleq(scalv4, epsi14v4, epsiv4);
            vreal epsj_usev4 = simd_vseleq(scalv4, epsj14v4, epsjv4);
            vreal sigi_usev4 = simd_vseleq(scalv4, sigi14v4, sigiv4);
            vreal sigj_usev4 = simd_vseleq(scalv4, sigj14v4, sigjv4);

            vreal epsv4 = epsi_usev4 * epsj_usev4; //simd_vseleq(scalv4, eps14v4, epsxxv4);
            vreal sigv4 = sigi_usev4 + sigj_usev4; //simd_vseleq(scalv4, sig14v4, sigxxv4);

            vreal sig_r2v4 = sigv4 * sigv4 * inv_r2v4;
            vreal sig_r6v4 = sig_r2v4 * sig_r2v4 * sig_r2v4;
            vreal sig_r12v4 = sig_r6v4 * sig_r6v4;
            wv4 = (sig_r12v4 - f2v4 * sig_r6v4) * epsv4;

            // wv4 = simd_vnmad(f2v4, sig_r6v4, sig_r12v4) * epsv4;
            dwdrv4 = f12v4 * (sig_r6v4 - sig_r12v4) * epsv4 * inv_r2v4;
            // w = (sig_r12 - 2 * sig_r6) * eps;
            // dwdr = 12 * (sig_r6 - sig_r12) * eps * inv_r2;
            // lwpf_start(SECT_GLD);
            vreal gtswv4 = simd_vfcmpgt(r2v4, rsw2v4);
            int has_gtsw;
            has_ftrue(has_gtsw, gtswv4);
            
            if (has_gtsw) {
              vreal sv4, dsdrv4;
              sv4 = (rcut2v4 - r2v4) * (rcut2v4 - r2v4) * (rcut2v4 + f2v4 * r2v4 - f3v4 * rsw2v4) * inv_denom_switchv4;
              dsdrv4 = f12v4 * (rcut2v4 - r2v4) * (rsw2v4 - r2v4) * inv_denom_switchv4;

              sv4 = simd_vselgt(gtswv4, sv4, f1v4);
              dsdrv4 = simd_vselgt(gtswv4, dsdrv4, f0v4);
              dwdrv4 = wv4 * dsdrv4 + dwdrv4 * sv4;
              wv4 = wv4 * sv4;
            }
            //  else {
            //   dwdrv4 = dw0drv4;
            //   wv4 = w0v4;
            // }
            // lwpf_stop(SECT_GLD);

            dwdrv4 = -dwdrv4;
            wv4 = simd_vseleq(exclv4, f0v4, wv4);
            wv4 = simd_vselgt(ltcutv4, wv4, f0v4);
            dwdrv4 = simd_vseleq(exclv4, f0v4, dwdrv4);


            vreal rhov4 = rv4 * inv_rcutv4;
            vreal rho2v4 = r2v4 * inv_rcut2v4;
            vreal egammav4 = f1v4 - rhov4 * CAT(GAMMA, ORDER)(rho2v4);
            vreal fgammav4 = f1v4 + rho2v4 * CAT(DGAMMA, ORDER)(rhov4, rho2v4);
            vreal u_origv4 = kqqv4 * inv_rv4;
            vreal dudr_origv4 = u_origv4 * inv_r2v4;
            vreal coul_excl_factorv4 = simd_vseleq(scalv4, scale14cv4, f0v4);
            coul_excl_factorv4 = simd_vseleq(exclv4, f1v4, coul_excl_factorv4);

            uv4 = u_origv4 * (egammav4 - coul_excl_factorv4);
            dudrv4 = dudr_origv4 * (fgammav4 - coul_excl_factorv4);
            uv4 = simd_vselgt(ltcutv4, uv4, f0v4);

            vreal fpairv4 = dwdrv4 + dudrv4;

            fpairv4 = simd_vselgt(ltcutv4, fpairv4, f0v4);
            vreal fxiv4, fyiv4, fziv4;
            simd_load(fxiv4, fxit + i);
            simd_load(fyiv4, fyit + i);
            simd_load(fziv4, fzit + i);

            fxjv4 -= fpairv4 * dxv4;
            fyjv4 -= fpairv4 * dyv4;
            fzjv4 -= fpairv4 * dzv4;

            fxiv4 += fpairv4 * dxv4;
            fyiv4 += fpairv4 * dyv4;
            fziv4 += fpairv4 * dzv4;

            simd_store(fxiv4, fxit + i);
            simd_store(fyiv4, fyit + i);
            simd_store(fziv4, fzit + i);
            // vecscaleaddv(fi[i], fi[i], datom, 1, fpair);
            // vecscaleaddv(fj[j], fj[j], datom, 1, -fpair);

            evdwlv4 += wv4;
            ecoulv4 += uv4;
          }
        }
        for (int i = ni; i < ni + VEC_MASK; i++) {
          is_masked[i] = 0;
        }
        lwpf_stop(ILOOP);
        vec<real> fjj;
        simd_vsum(fjj.x, fxjv4);
        simd_vsum(fjj.y, fyjv4);
        simd_vsum(fjj.z, fzjv4);
        vecaddv(fj[j], fj[j], fjj);
      }
      lwpf_stop(JLOOP);
      if (icell == jcell) {
        for (int i = 0; i < jmeta.natom; i++) {
          fj[i].x += fxit[i];
          fj[i].y += fyit[i];
          fj[i].z += fzit[i];
        }
      }
      lwpf_start(DMA_WJ);
      if (irep == 0) {
        pe_put(jcell->f, fj, sizeof(vec<real>) * jmeta.natom);
      } else {
        // if (jmeta.first_frep + irep - 1 == 1231) printf("update 1231: %d %d %d %d\n", _MYID, ii, jj, kk);
        pe_put(archdata.freps[jmeta.first_frep + irep - 1].f, fj, sizeof(vec<real>) * jmeta.natom);
      }
      dma_syn();
      lwpf_stop(DMA_WJ);
    }
    // asm volatile("halt");
  }
  lwpf_stop(COMPUTE);
  qthread_syn();

  lwpf_start(FINI);
  FOREACH_CELL(&lgrid, ii, jj, kk, cell) {
    if (getpe_all(&lgrid, ii, jj, kk) != _MYID)
      continue;
    cellmeta_t meta;
    pe_get(&(cell->basis), &meta, sizeof(cellmeta_t));
    dma_syn();
    vec<real> f[CELL_CAP];
    pe_get(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();
    int nrep_cell = __builtin_popcountl(meta.pe_mask) - 1;
    for (int irep = 0; irep < nrep_cell; irep++) {
      vec<real> frep[CELL_CAP];
      pe_get(archdata.freps[meta.first_frep + irep].f, frep, sizeof(vec<real>) * meta.natom);
      dma_syn();
      for (int i = 0; i < meta.natom; i++) {
        vecaddv(f[i], f[i], frep[i]);
      }
    }
    pe_put(cell->f, f, sizeof(vec<real>) * meta.natom);
    dma_syn();
  }

  double energy[4];
  // if (!_MYID) {
  //   simd_print_doublev4(evdwlv4);
  //   simd_print_doublev4(ecoulv4);
  // }
  // cal_global_lock();
  // printf("%d\n", _MYID);
  // simd_print_doublev4(ecoulv4);
  // cal_global_unlock();
  simd_vsum(energy[0], evdwlv4);
  simd_vsum(energy[1], ecoulv4);
  // cal_locked_printf("%d %f\n", _MYID, energy[1]);
  // energy = simd_vinsf0(energy, evdwl);
  // energy = simd_vinsf1(energy, ecoul);
  // if (!_MYID) printf("%f %f\n", energy[0], energy[1]);
  reg_reduce_inplace_doublev4((doublev4*)energy, 1);
  // if (!_MYID) printf("%f %f\n", energy[0], energy[1]);
  if (_MYID == 0) {
    lstat.evdwl += energy[0];
    lstat.ecoul += energy[1];
    pe_put(lpm.stat, &lstat, sizeof(mdstat_t));
    dma_syn();
  }
  // if (_MYID == 0){
  //   double ncut_sum;
  //   simd_vsumd(ncut_sum, ncut);
  //   printf("%d %f\n", ncompute, ncut_sum);
  // }
  // simd_print_doublev4(energy);
  lwpf_stop(FINI);
  lwpf_exit(NONB);
}
#undef __CAT__
#undef CAT
#if ORDER == 4
#undef GCONS0
#undef GCONS1
#undef GCONS2
#elif ORDER == 6
#undef GCONS0
#undef GCONS1
#undef GCONS2
#undef GCONS3
#elif ORDER == 8
#undef GCONS0
#undef GCONS1
#undef GCONS2
#undef GCONS3
#undef GCONS4
#elif ORDER == 10
#undef GCONS0
#undef GCONS1
#undef GCONS2
#undef GCONS3
#undef GCONS4
#undef GCONS5
#elif ORDER == 12
#undef GCONS0
#undef GCONS1
#undef GCONS2
#undef GCONS3
#undef GCONS4
#undef GCONS5
#undef GCONS6
#endif

#if ORDER == 4
#undef DGCONS0
#undef DGCONS1
#elif ORDER == 6
#undef DGCONS0
#undef DGCONS1
#undef DGCONS2
#elif ORDER == 8
#undef DGCONS0
#undef DGCONS1
#undef DGCONS2
#undef DGCONS3
#elif ORDER == 10
#undef DGCONS0
#undef DGCONS1
#undef DGCONS2
#undef DGCONS3
#undef DGCONS4
#elif ORDER == 12
#undef DGCONS0
#undef DGCONS1
#undef DGCONS2
#undef DGCONS3
#undef DGCONS4
#undef DGCONS5
#endif

